{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "%matplotlib inline"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "\n# Porto Seguro: balancing samples in mini-batches with Keras\n\nThis example compares two strategies to train a neural network on the Porto\nSeguro Kaggle data set [1]. The data set is imbalanced and we show that\nbalancing each mini-batch improves performance and reduces the training\ntime.\n\n## References\n\n[1] https://www.kaggle.com/c/porto-seguro-safe-driver-prediction/data\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "# Authors: Guillaume Lemaitre <g.lemaitre58@gmail.com>\n# License: MIT\n\nprint(__doc__)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "### Data loading\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "from collections import Counter\nimport pandas as pd\nimport numpy as np"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "First, you should download the Porto Seguro data set from Kaggle. See the\nlink in the introduction.\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "# Read the Kaggle CSV files from the local ./input/ directory.\ntraining_data = pd.read_csv(\"./input/train.csv\")\ntesting_data = pd.read_csv(\"./input/test.csv\")\n\n# Index every frame by \"id\" and separate the target column from the features.\ny_train = training_data.set_index(\"id\")[[\"target\"]]\nX_train = training_data.set_index(\"id\").drop(columns=[\"target\"])\nX_test = testing_data.set_index(\"id\")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "The data set is imbalanced and it will have an effect on the fitting.\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "# Count the samples of each class.\nclass_counts = Counter(y_train[\"target\"])\nprint(f\"The data set is imbalanced: {class_counts}\")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "### Define the pre-processing pipeline\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "from sklearn.compose import ColumnTransformer\nfrom sklearn.pipeline import make_pipeline\nfrom sklearn.preprocessing import OneHotEncoder\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn.preprocessing import FunctionTransformer\nfrom sklearn.impute import SimpleImputer\n\n\ndef convert_float64(X):\n    \"\"\"Return ``X`` cast to 64-bit floats through its ``astype`` method.\"\"\"\n    as_float = X.astype(dtype=np.float64)\n    return as_float"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "We want to standardize the numerical features and one-hot encode the\ncategorical features. To do so, we make use of the\n`sklearn.compose.ColumnTransformer`.\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "# Numerical features: columns whose name contains \"_calc_\" but not \"_bin\".\nnumerical_columns = [\n    name for name in X_train.columns if \"_calc_\" in name and \"_bin\" not in name\n]\nnumerical_pipeline = make_pipeline(\n    FunctionTransformer(func=convert_float64, validate=False), StandardScaler()\n)\n\n# Categorical features: columns whose name contains \"_cat\".\ncategorical_columns = [name for name in X_train.columns if \"_cat\" in name]\ncategorical_pipeline = make_pipeline(\n    # Values equal to -1 are treated as missing and replaced by the most\n    # frequent value of the column.\n    SimpleImputer(missing_values=-1, strategy=\"most_frequent\"),\n    # The preprocessor is fitted on a subset of the rows during\n    # cross-validation: encode a category unseen at fit time as all zeros\n    # instead of raising an error at transform time.\n    OneHotEncoder(categories=\"auto\", handle_unknown=\"ignore\"),\n)\n\n# Columns that belong to neither group are dropped.\npreprocessor = ColumnTransformer(\n    [\n        (\"numerical_preprocessing\", numerical_pipeline, numerical_columns),\n        (\n            \"categorical_preprocessing\",\n            categorical_pipeline,\n            categorical_columns,\n        ),\n    ],\n    remainder=\"drop\",\n)\n\n# Create an environment variable to avoid using the GPU. This can be changed.\nimport os\n\nos.environ[\"CUDA_VISIBLE_DEVICES\"] = \"-1\""
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "### Create a neural-network\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "from tensorflow.keras.models import Sequential\nfrom tensorflow.keras.layers import (\n    Activation,\n    Dense,\n    Dropout,\n    BatchNormalization,\n)\n\n\ndef make_model(n_features):\n    \"\"\"Build and compile the fully-connected binary classifier.\n\n    Four hidden stages (200, 100, 50 and 25 units) each chain a dense layer,\n    batch normalization, a ReLU activation and dropout; a single sigmoid unit\n    produces the output.\n\n    Parameters\n    ----------\n    n_features : int\n        Number of input features, used as the input shape of the first layer.\n\n    Returns\n    -------\n    model : Sequential\n        Model compiled with the binary cross-entropy loss, the Adam optimizer\n        and the accuracy metric.\n    \"\"\"\n    # (units, dropout rate) of the hidden stages that follow the input stage.\n    hidden_stages = ((100, 0.25), (50, 0.15), (25, 0.1))\n\n    layers = [\n        # Only the first dense layer declares the input shape and keeps its bias.\n        Dense(200, input_shape=(n_features,), kernel_initializer=\"glorot_normal\"),\n        BatchNormalization(),\n        Activation(\"relu\"),\n        Dropout(0.5),\n    ]\n    for units, dropout_rate in hidden_stages:\n        layers += [\n            Dense(units, kernel_initializer=\"glorot_normal\", use_bias=False),\n            BatchNormalization(),\n            Activation(\"relu\"),\n            Dropout(dropout_rate),\n        ]\n    layers.append(Dense(1, activation=\"sigmoid\"))\n\n    model = Sequential()\n    for layer in layers:\n        model.add(layer)\n\n    model.compile(loss=\"binary_crossentropy\", optimizer=\"adam\", metrics=[\"accuracy\"])\n\n    return model"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "We create a decorator that reports the computation time of a function and\nreturns it together with the result of the function.\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "import time\nfrom functools import wraps\n\n\ndef timeit(f):\n    @wraps(f)\n    def wrapper(*args, **kwds):\n        start_time = time.time()\n        result = f(*args, **kwds)\n        elapsed_time = time.time() - start_time\n        print(f\"Elapsed computation time: {elapsed_time:.3f} secs\")\n        return (elapsed_time, result)\n\n    return wrapper"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "The first model will be trained using the ``fit`` method and with imbalanced\nmini-batches.\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "from sklearn.metrics import roc_auc_score\n\n\n@timeit\ndef fit_predict_imbalanced_model(X_train, y_train, X_test, y_test):\n    \"\"\"Train a network on the data as-is and score it on the test set.\n\n    The network is fitted for 2 epochs with the ``fit`` method, using\n    mini-batches of 1000 samples taken from the training data without any\n    re-balancing.\n\n    Parameters\n    ----------\n    X_train, y_train\n        Training features and targets.\n    X_test, y_test\n        Features and targets used to compute the score.\n\n    Returns\n    -------\n    roc_auc : float\n        ROC-AUC of the predictions on the test set. Because of the ``timeit``\n        decorator, callers receive ``(elapsed_time, roc_auc)``.\n    \"\"\"\n    model = make_model(X_train.shape[1])\n    model.fit(X_train, y_train, epochs=2, verbose=1, batch_size=1000)\n    # ``Sequential.predict_proba`` is no longer available in recent\n    # TensorFlow/Keras releases; with a sigmoid output unit, ``predict``\n    # already returns the probabilities.\n    y_pred = model.predict(X_test, batch_size=1000)\n    return roc_auc_score(y_test, y_pred)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "In contrast, we will use imbalanced-learn to create a generator that\nyields balanced mini-batches.\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "from imblearn.keras import BalancedBatchGenerator\n\n\n@timeit\ndef fit_predict_balanced_model(X_train, y_train, X_test, y_test):\n    \"\"\"Train a network on generated mini-batches and score it on the test set.\n\n    The mini-batches of 1000 samples are produced by a\n    ``BalancedBatchGenerator`` seeded with ``random_state=42`` and the network\n    is fitted on them for 5 epochs.\n\n    Parameters\n    ----------\n    X_train, y_train\n        Training features and targets handed to the batch generator.\n    X_test, y_test\n        Features and targets used to compute the score.\n\n    Returns\n    -------\n    roc_auc : float\n        ROC-AUC of the predictions on the test set. Because of the ``timeit``\n        decorator, callers receive ``(elapsed_time, roc_auc)``.\n    \"\"\"\n    n_features = X_train.shape[1]\n    network = make_model(n_features)\n    balanced_batches = BalancedBatchGenerator(\n        X_train, y_train, batch_size=1000, random_state=42\n    )\n    network.fit(balanced_batches, epochs=5, verbose=1)\n    test_scores = network.predict(X_test, batch_size=1000)\n    return roc_auc_score(y_test, test_scores)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "### Classification loop\n\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "We will perform a 10-fold cross-validation and train the neural-network with\nthe two different strategies previously presented.\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "from sklearn.model_selection import StratifiedKFold\n\nskf = StratifiedKFold(n_splits=10)\n\ncv_results_imbalanced, cv_time_imbalanced = [], []\ncv_results_balanced, cv_time_balanced = [], []\n\n# Each strategy is paired with the lists collecting its timings and scores;\n# the imbalanced strategy runs first on every fold.\nstrategies = (\n    (fit_predict_imbalanced_model, cv_time_imbalanced, cv_results_imbalanced),\n    (fit_predict_balanced_model, cv_time_balanced, cv_results_balanced),\n)\n\nfor train_idx, valid_idx in skf.split(X_train, y_train):\n    # Fit the pre-processing on the training fold only, then apply it to the\n    # validation fold.\n    X_local_train = preprocessor.fit_transform(X_train.iloc[train_idx])\n    X_local_test = preprocessor.transform(X_train.iloc[valid_idx])\n    y_local_train = y_train.iloc[train_idx].values.ravel()\n    y_local_test = y_train.iloc[valid_idx].values.ravel()\n\n    for fit_predict, fold_times, fold_scores in strategies:\n        elapsed_time, roc_auc = fit_predict(\n            X_local_train, y_local_train, X_local_test, y_local_test\n        )\n        fold_times.append(elapsed_time)\n        fold_scores.append(roc_auc)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "### Plot of the results and computation time\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "import seaborn as sns\nimport matplotlib.pyplot as plt\n\n# Reshape to long format: after unstack/reset_index, \"level_0\" holds the model\n# name and column 0 holds the measured value.\ndf_results = (\n    pd.DataFrame(\n        {\n            \"Balanced model\": cv_results_balanced,\n            \"Imbalanced model\": cv_results_imbalanced,\n        }\n    )\n    .unstack()\n    .reset_index()\n)\ndf_time = (\n    pd.DataFrame(\n        {\"Balanced model\": cv_time_balanced, \"Imbalanced model\": cv_time_imbalanced}\n    )\n    .unstack()\n    .reset_index()\n)\n\n\ndef as_percent(x, pos):\n    \"\"\"Format the tick value ``x`` as an integer percentage.\"\"\"\n    return \"%i%%\" % (100 * x)\n\n\nfig_time, ax_time = plt.subplots()\nsns.boxplot(y=\"level_0\", x=0, data=df_time, ax=ax_time)\nsns.despine(ax=ax_time, top=True, right=True, left=True)\nax_time.set_xlabel(\"time [s]\")\nax_time.set_ylabel(\"\")\nax_time.set_title(\"Computation time difference using a random under-sampling\")\n\nfig_auc, ax_auc = plt.subplots()\nsns.boxplot(y=\"level_0\", x=0, data=df_results, whis=10.0, ax=ax_auc)\nsns.despine(ax=ax_auc, top=True, right=True, left=True)\nax_auc.xaxis.set_major_formatter(plt.FuncFormatter(as_percent))\nax_auc.set_xlabel(\"ROC-AUC\")\nax_auc.set_ylabel(\"\")\nax_auc.set_title(\"Difference in terms of ROC-AUC using a random under-sampling\")"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.10.4"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}